packages <- c('sf', 'tidyverse', 'tmap', 'httr', 'jsonlite', 'rvest',
'sp', 'ggpubr', 'corrplot', 'broom', 'olsrr', 'spdep',
'GWmodel', 'devtools', 'rgeos', 'lwgeom', 'maptools', 'matrixStats',
'units', 'gtsummary', 'Metrics', 'rsample', 'SpatialML')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p, repos = "http://cran.us.r-project.org")
}
library(p, character.only = T)
}Take-home Exercise 3: Predicting HDB Public Housing Resale Pricies using Geographically Weighted Methods
1 Overview
1.1 Setting the Scene
Housing is an essential component of household wealth worldwide. Buying a housing has always been a major investment for most people. The price of housing is affected by many factors. Some of them are global in nature such as the general economy of a country or inflation rate. Others can be more specific to the properties themselves. These factors can be further divided to structural and locational factors. Structural factors are variables related to the property themselves such as the size, fitting, and tenure of the property. Locational factors are variables related to the neighbourhood of the properties such as proximity to childcare centre, public transport service and shopping centre.
Conventional, housing resale prices predictive models were built by using Ordinary Least Square (OLS) method. However, this method failed to take into consideration that spatial autocorrelation and spatial heterogeneity exist in geographic data sets such as housing transactions. With the existence of spatial autocorrelation, the OLS estimation of predictive housing resale pricing models could lead to biased, inconsistent, or inefficient results (Anselin 1998). In view of this limitation, Geographical Weighted Models were introduced for calibrating predictive model for housing resale prices.
1.2 The Task
In this take-home exercise, you are tasked to predict HDB resale prices at the sub-market level (i.e. HDB 3-room, HDB 4-room and HDB 5-room) for the month of January and February 2023 in Singapore. The predictive models must be built by using by using conventional OLS method and GWR methods. You are also required to compare the performance of the conventional OLS method versus the geographical weighted methods.
1.3 The Data
For the purpose of this take-home exercise, HDB Resale Flat Prices provided by Data.gov.sg should be used as the core data set. The study should focus on either three-room, four-room or five-room flat and transaction period should be from 1st January 2021 to 31st December 2022. The test data should be January and February 2023 resale prices.
| Aspatial Data Set | Source |
|---|---|
| HDB Resale Data | Data.gov.sg |
| Geospatial Data Set | Source |
|---|---|
| Master Plan 2019 Sub-Zone Boundary | Prof. Kam |
| Hawker Centres | OneMap |
| Childcare Centres | OneMap |
| Kindergartens | OneMap |
| NParks Parks and Nature Reserves | OneMap |
| Shopping Malls | Wikipedia; GitHub - ValaryLim |
| Supermarket | Data.gov.sg |
| Eldercare | Data.gov.sg |
| MRT Stations | LTA DataMall |
| Bus Stops | LTA DataMall |
1.4 Acknowledgement
| Name | Source |
|---|---|
| Prof. Kam | Hands-On Ex 8; In-Class Ex 9 |
| Megan Sim | Take-Home Ex 3 |
| Nor Aisyah Binte Ajit | Take-Home Ex 3 |
2 Getting Started
2.1 Import Packages
2.2 Importing Aspatial Data
resale <- read_csv("data/aspatial/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv")head(resale)# A tibble: 6 × 11
month town flat_…¹ block stree…² store…³ floor…⁴ flat_…⁵ lease…⁶ remai…⁷
<chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <chr>
1 2017-01 ANG MO … 2 ROOM 406 ANG MO… 10 TO … 44 Improv… 1979 61 yea…
2 2017-01 ANG MO … 3 ROOM 108 ANG MO… 01 TO … 67 New Ge… 1978 60 yea…
3 2017-01 ANG MO … 3 ROOM 602 ANG MO… 01 TO … 67 New Ge… 1980 62 yea…
4 2017-01 ANG MO … 3 ROOM 465 ANG MO… 04 TO … 68 New Ge… 1980 62 yea…
5 2017-01 ANG MO … 3 ROOM 601 ANG MO… 01 TO … 67 New Ge… 1980 62 yea…
6 2017-01 ANG MO … 3 ROOM 150 ANG MO… 01 TO … 68 New Ge… 1981 63 yea…
# … with 1 more variable: resale_price <dbl>, and abbreviated variable names
# ¹flat_type, ²street_name, ³storey_range, ⁴floor_area_sqm, ⁵flat_model,
# ⁶lease_commence_date, ⁷remaining_lease
2.2.1 Filter Resale Data
Resale Data filter includes both training data and test data set
resale_filter <- filter(resale, flat_type == "4 ROOM") %>%
filter(month >= "2021-01" & month <= "2023-02")2.2.1.1 Glimpse Filtered Resale Data
glimpse(resale_filter)Rows: 25,502
Columns: 11
$ month <chr> "2021-01", "2021-01", "2021-01", "2021-01", "2021-…
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "ANG MO …
$ flat_type <chr> "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", …
$ block <chr> "547", "414", "509", "467", "571", "134", "204", "…
$ street_name <chr> "ANG MO KIO AVE 10", "ANG MO KIO AVE 10", "ANG MO …
$ storey_range <chr> "04 TO 06", "01 TO 03", "01 TO 03", "07 TO 09", "0…
$ floor_area_sqm <dbl> 92, 92, 91, 92, 92, 98, 92, 92, 92, 92, 92, 109, 9…
$ flat_model <chr> "New Generation", "New Generation", "New Generatio…
$ lease_commence_date <dbl> 1981, 1979, 1980, 1979, 1979, 1978, 1977, 1978, 19…
$ remaining_lease <chr> "59 years", "57 years 09 months", "58 years 06 mon…
$ resale_price <dbl> 370000, 375000, 380000, 385000, 410000, 410000, 41…
2.2.1.2 Check for Correct Time Period
unique(resale_filter$month) [1] "2021-01" "2021-02" "2021-03" "2021-04" "2021-05" "2021-06" "2021-07"
[8] "2021-08" "2021-09" "2021-10" "2021-11" "2021-12" "2022-01" "2022-02"
[15] "2022-03" "2022-04" "2022-05" "2022-06" "2022-07" "2022-08" "2022-09"
[22] "2022-10" "2022-11" "2022-12" "2023-01" "2023-02"
2.2.1.3 Check for Correct Flat Type
unique(resale_filter$flat_type)[1] "4 ROOM"
2.2.2 Transform Resale Data
2.2.2.1 Adding New Columns
resale_transform <- resale_filter %>%
mutate(resale_filter, address = paste(block,street_name)) %>%
mutate(resale_filter, remaining_lease_yr = as.integer(str_sub(remaining_lease, 0, 2))) %>%
mutate(resale_filter, remaining_lease_mth = as.integer(str_sub(remaining_lease, 9, 11)))2.2.2.2 Replace NA Values in “remaining_lease_mth”
resale_transform$remaining_lease_mth[is.na(resale_transform$remaining_lease_mth)] <- 02.2.2.3 Convert “remaining_lease_yr” to Months
resale_transform$remaining_lease_yr <- resale_transform$remaining_lease_yr * 12
resale_transform <- resale_transform %>%
mutate(resale_transform, remaining_lease_mths = rowSums(resale_transform[, c("remaining_lease_yr", "remaining_lease_mth")])) %>%
select(month, town, address, block, street_name, flat_type, storey_range, floor_area_sqm, flat_model,
lease_commence_date, remaining_lease_mths, resale_price)2.2.3 Getting Unique Addresses
address <- sort(unique(resale_transform$address))head(address)[1] "1 CHAI CHEE RD" "1 PINE CL" "1 ST. GEORGE'S RD"
[4] "1 TECK WHYE AVE" "1 TOH YI DR" "10 CHAI CHEE RD"
2.2.4 Getting LAT & LONG from OneMap.sg API
get_coords <- function(add_list){
# Create a data frame to store all retrieved coordinates
postal_coords <- data.frame()
for (i in add_list){
#print(i)
r <- GET('https://developers.onemap.sg/commonapi/search?',
query=list(searchVal=i,
returnGeom='Y',
getAddrDetails='Y'))
data <- fromJSON(rawToChar(r$content))
found <- data$found
res <- data$results
# Create a new data frame for each address
new_row <- data.frame()
# If single result, append
if (found == 1){
postal <- res$POSTAL
lat <- res$LATITUDE
lng <- res$LONGITUDE
new_row <- data.frame(address= i, postal = postal, latitude = lat, longitude = lng)
}
# If multiple results, drop NIL and append top 1
else if (found > 1){
# Remove those with NIL as postal
res_sub <- res[res$POSTAL != "NIL", ]
# Set as NA first if no Postal
if (nrow(res_sub) == 0) {
new_row <- data.frame(address= i, postal = NA, latitude = NA, longitude = NA)
}
else{
top1 <- head(res_sub, n = 1)
postal <- top1$POSTAL
lat <- top1$LATITUDE
lng <- top1$LONGITUDE
new_row <- data.frame(address= i, postal = postal, latitude = lat, longitude = lng)
}
}
else {
new_row <- data.frame(address= i, postal = NA, latitude = NA, longitude = NA)
}
# Add the row
postal_coords <- rbind(postal_coords, new_row)
}
return(postal_coords)
}2.2.4.1 Calling Function
latlong <- get_coords(address)2.2.4.2 Check for Missing Value
latlong[(is.na(latlong))]character(0)
2.2.5 Combine Resale and LAT & LONG
resale_latlong <- left_join(resale_transform, latlong, by = c('address' = 'address'))head(resale_latlong)# A tibble: 6 × 15
month town address block stree…¹ flat_…² store…³ floor…⁴ flat_…⁵ lease…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-01 ANG MO … 547 AN… 547 ANG MO… 4 ROOM 04 TO … 92 New Ge… 1981
2 2021-01 ANG MO … 414 AN… 414 ANG MO… 4 ROOM 01 TO … 92 New Ge… 1979
3 2021-01 ANG MO … 509 AN… 509 ANG MO… 4 ROOM 01 TO … 91 New Ge… 1980
4 2021-01 ANG MO … 467 AN… 467 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
5 2021-01 ANG MO … 571 AN… 571 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
6 2021-01 ANG MO … 134 AN… 134 ANG MO… 4 ROOM 07 TO … 98 New Ge… 1978
# … with 5 more variables: remaining_lease_mths <dbl>, resale_price <dbl>,
# postal <chr>, latitude <chr>, longitude <chr>, and abbreviated variable
# names ¹street_name, ²flat_type, ³storey_range, ⁴floor_area_sqm,
# ⁵flat_model, ⁶lease_commence_date
2.2.6 Check for NA
resale_latlong[(is.na(resale_latlong))]<unspecified> [0]
2.2.7 Write File to rds
resale_latlong.rds <- write_rds(resale_latlong, "data/model/resale_latlong.rds")2.2.8 Read resale_rds File
resale_main <- read_rds("data/model/resale_latlong.rds")head(resale_main)# A tibble: 6 × 15
month town address block stree…¹ flat_…² store…³ floor…⁴ flat_…⁵ lease…⁶
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
1 2021-01 ANG MO … 547 AN… 547 ANG MO… 4 ROOM 04 TO … 92 New Ge… 1981
2 2021-01 ANG MO … 414 AN… 414 ANG MO… 4 ROOM 01 TO … 92 New Ge… 1979
3 2021-01 ANG MO … 509 AN… 509 ANG MO… 4 ROOM 01 TO … 91 New Ge… 1980
4 2021-01 ANG MO … 467 AN… 467 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
5 2021-01 ANG MO … 571 AN… 571 ANG MO… 4 ROOM 07 TO … 92 New Ge… 1979
6 2021-01 ANG MO … 134 AN… 134 ANG MO… 4 ROOM 07 TO … 98 New Ge… 1978
# … with 5 more variables: remaining_lease_mths <dbl>, resale_price <dbl>,
# postal <chr>, latitude <chr>, longitude <chr>, and abbreviated variable
# names ¹street_name, ²flat_type, ³storey_range, ⁴floor_area_sqm,
# ⁵flat_model, ⁶lease_commence_date
2.2.9 Transform to sf and Assign CRS
resale_main_sf <- st_as_sf(resale_main,
coords = c("longitude",
"latitude"),
crs=4326) %>%
st_transform(crs = 3414)st_crs(resale_main_sf)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.2.10 Check for Invalid Geometries
length(which(st_is_valid(resale_main_sf) == FALSE))[1] 0
2.3 Importing Geospatial Data
Importing Master Plan 2019 Subzone
mpsz <- st_read(dsn = "data/geospatial", layer = "MPSZ-2019")Reading layer `MPSZ-2019' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS: WGS 84
Check for Invalid Geometry
length(which(st_is_valid(mpsz) == FALSE))[1] 6
Make Geometry Valid and Check for Invalid Geometry Again
mpsz <- st_make_valid(mpsz)
length(which(st_is_valid(mpsz) == FALSE))[1] 0
Assign ESPG Code
mpsz <- st_transform(mpsz, 3414)
st_crs(mpsz)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.3.1 Data With LAT & LONG
2.3.1.1 Elderly Care
elder_sf <- st_read(dsn = "data/geospatial", layer = "ELDERCARE")Reading layer `ELDERCARE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 133 features and 18 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 14481.92 ymin: 28218.43 xmax: 41665.14 ymax: 46804.9
Projected CRS: SVY21
# Assign EPSG Code
elder_sf <- st_transform(elder_sf, 3414)2.3.1.2 Bus Stops
BusStop_sf <- st_read(dsn = "data/geospatial", layer = "BusStop")Reading layer `BusStop' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 5159 features and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 3970.122 ymin: 26482.1 xmax: 48284.56 ymax: 52983.82
Projected CRS: SVY21
# Assign EPSG Code
BusStop_sf <- st_transform(BusStop_sf, 3414)2.3.1.3 MRT Stations
mrt <- read.csv("data/geospatial/mrtsg.csv")
mrt_sf <- st_as_sf(mrt,
coords = c("Longitude",
"Latitude"),
crs = 4326) %>%
st_transform(crs = 3414)2.3.1.4 Childcare Center
childcare_sf <- st_read(dsn = "data/geospatial", layer = "CHILDCARE")Reading layer `CHILDCARE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 1545 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 11203.01 ymin: 25667.6 xmax: 45404.24 ymax: 49300.88
Projected CRS: WGS_1984_Transverse_Mercator
# Assign EPSG Code
childcare_sf <- st_transform(childcare_sf, 3414)2.3.1.5 Kindergarten
kindergarten_sf <- st_read(dsn = "data/geospatial", layer = "KINDERGARTENS")Reading layer `KINDERGARTENS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 448 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 11909.7 ymin: 25596.33 xmax: 43395.47 ymax: 48562.06
Projected CRS: SVY21
# Assign EPSG Code
kindergarten_sf <- st_transform(kindergarten_sf, 3414)2.3.1.6 Parks
parks_sf <- st_read(dsn = "data/geospatial", layer = "NATIONALPARKS")Reading layer `NATIONALPARKS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 352 features and 15 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 12373.11 ymin: 21869.93 xmax: 46735.95 ymax: 49231.09
Projected CRS: SVY21
# Assign EPSG Code
parks_sf <- st_transform(parks_sf, 3414)2.3.1.6 Hawker Centre
hawker_sf <- st_read(dsn = "data/geospatial", layer = "HAWKERCENTRE")Reading layer `HAWKERCENTRE' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 125 features and 21 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 12874.19 ymin: 28355.97 xmax: 45241.4 ymax: 47872.53
Projected CRS: SVY21
# Assign EPSG Code
hawker_sf <- st_transform(hawker_sf, 3414)2.3.1.7 Supermarkets
supermarket_sf <- st_read(dsn = "data/geospatial", layer = "SUPERMARKETS")Reading layer `SUPERMARKETS' from data source
`C:\Users\la935\Desktop\IS415 - GAA\IS415 - GAA (New)\Take-Home_Ex\Take-Home_Ex03\data\geospatial'
using driver `ESRI Shapefile'
Simple feature collection with 526 features and 8 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 4901.188 ymin: 25529.08 xmax: 46948.22 ymax: 49233.6
Projected CRS: SVY21
# Assign EPSG Code
supermarket_sf <- st_transform(supermarket_sf, 3414)2.3.2 Data Without LAT & LONG
2.3.2.1 CBD (Downtown Core, Singapore)
# Storing LAT & LONG for CBD as Dataframe
name <- c('CBD')
latitude = c(1.287953)
longitude = c(103.851784)
cbd <- data.frame(name, latitude, longitude)# Convert to sf and Assign EPSG
cbd_sf <- st_as_sf(cbd,
coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)
st_crs(cbd_sf)Coordinate Reference System:
User input: EPSG:3414
wkt:
PROJCRS["SVY21 / Singapore TM",
BASEGEOGCRS["SVY21",
DATUM["SVY21",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4757]],
CONVERSION["Singapore Transverse Mercator",
METHOD["Transverse Mercator",
ID["EPSG",9807]],
PARAMETER["Latitude of natural origin",1.36666666666667,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8801]],
PARAMETER["Longitude of natural origin",103.833333333333,
ANGLEUNIT["degree",0.0174532925199433],
ID["EPSG",8802]],
PARAMETER["Scale factor at natural origin",1,
SCALEUNIT["unity",1],
ID["EPSG",8805]],
PARAMETER["False easting",28001.642,
LENGTHUNIT["metre",1],
ID["EPSG",8806]],
PARAMETER["False northing",38744.572,
LENGTHUNIT["metre",1],
ID["EPSG",8807]]],
CS[Cartesian,2],
AXIS["northing (N)",north,
ORDER[1],
LENGTHUNIT["metre",1]],
AXIS["easting (E)",east,
ORDER[2],
LENGTHUNIT["metre",1]],
USAGE[
SCOPE["Cadastre, engineering survey, topographic mapping."],
AREA["Singapore - onshore and offshore."],
BBOX[1.13,103.59,1.47,104.07]],
ID["EPSG",3414]]
2.3.2.2 Primary Schools
primary <- read.csv("data/geospatial/general-information-of-schools.csv")
primary <- primary %>%
filter(mainlevel_code == "PRIMARY") %>%
select(school_name, address, postal_code, mainlevel_code)
glimpse(primary)Rows: 183
Columns: 4
$ school_name <chr> "ADMIRALTY PRIMARY SCHOOL", "AHMAD IBRAHIM PRIMARY SCHO…
$ address <chr> "11 WOODLANDS CIRCLE", "10 YISHUN STREET 11", "100 …
$ postal_code <int> 738907, 768643, 579646, 159016, 544969, 569785, 569920,…
$ mainlevel_code <chr> "PRIMARY", "PRIMARY", "PRIMARY", "PRIMARY", "PRIMARY", …
Converting postal codes to LAT & LONG
# Store Primary School Postal and Retrieve LAT & LONG
primary_postal <- unique(primary$postal_code)
primary_latlong <- get_coords(primary_postal)Check for NA
primary_latlong[(is.na(primary_latlong))][1] NA NA NA NA NA NA NA NA NA
Error found for postal with 0 at the start, replace the postal code with 0 at the front
primary$postal_code[primary$postal_code == '88256'] <- '088256'
primary$postal_code[primary$postal_code == '99757'] <- '099757'
primary$postal_code[primary$postal_code == '99840'] <- '099840'Re-run converting postal code to LAT & LONG
primary_postal <- unique(primary$postal_code)
primary_latlong <- get_coords(primary_postal)Check for NA
primary_latlong[(is.na(primary_latlong))]character(0)
Combine LAT & LONG with primary file
primary_school <- left_join(primary, primary_latlong, by = c('postal_code' = 'postal'))Convert to sf object and transform CRS
primary_school_sf <- st_as_sf(primary_school,
coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)2.3.2.3 Good Primary Schools
Got the top 10 schools based on a website
good_primary_school <- primary_school %>%
filter(school_name %in%
c("PEI HWA PRESBYTERIAN PRIMARY SCHOOL",
"GONGSHANG PRIMARY SCHOOL",
"RIVERSIDE PRIMARY SCHOOL",
"RED SWASTIKA SCHOOL",
"PUNGGOL GREEN PRIMARY SCHOOL",
"PRINCESS ELIZABETH PRIMARY SCHOOL",
"WESTWOOD PRIMARY SCHOOL",
"AI TONG SCHOOL",
"FRONTIER PRIMARY SCHOOL",
"OASIS PRIMARY SCHOOL"))Convert to sf object and transform CRS
good_primary_school_sf <- st_as_sf(good_primary_school,
coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)2.3.2.4 Shopping Malls
shopping <- read.csv("data/geospatial/mall_coordinates.csv")
shopping <- shopping %>%
select(name, latitude, longitude)
glimpse(shopping)Rows: 199
Columns: 3
$ name <chr> "100 AM", "i12 Katong", "313@SOMERSET", "321 CLEMENTI", "600…
$ latitude <dbl> 1.274588, 1.305087, 1.301385, 1.312025, 1.334042, 1.437131, …
$ longitude <dbl> 103.8435, 103.9051, 103.8377, 103.7650, 103.8510, 103.7953, …
shopping_sf <- st_as_sf(shopping,
coords = c("longitude",
"latitude"),
crs = 4326) %>%
st_transform(crs = 3414)3 Proximity Calculation
3.1 Calculation Function
prox_cal <- function(df1, df2, col_name) {
dist_matrix <- st_distance(df1, df2)
df1[,col_name] <- rowMins(dist_matrix) / 1000
return(df1)
}3.2 Calculation of Locational Factors
resale_main_sf <- prox_cal(resale_main_sf, cbd_sf, "PROX_CBD")
resale_main_sf <- prox_cal(resale_main_sf, BusStop_sf, "PROX_BUS")
resale_main_sf <- prox_cal(resale_main_sf, childcare_sf, "PROX_CHILDCARE")
resale_main_sf <- prox_cal(resale_main_sf, elder_sf, "PROX_ELDERCARE")
resale_main_sf <- prox_cal(resale_main_sf, hawker_sf, "PROX_HAWKER")
resale_main_sf <- prox_cal(resale_main_sf, good_primary_school_sf, "PROX_GOODPRIMARY")
resale_main_sf <- prox_cal(resale_main_sf, parks_sf, "PROX_PARK")
resale_main_sf <- prox_cal(resale_main_sf, supermarket_sf, "PROX_SUPERMARKET")
resale_main_sf <- prox_cal(resale_main_sf, shopping_sf, "PROX_SHOPPING")
resale_main_sf <- prox_cal(resale_main_sf, mrt_sf, "PROX_MRT")3.3 Calculation Function With Radius
prox_cal_radius <- function(df1, df2, col_name, radius) {
dist_matrix <- st_distance(df1, df2) %>%
drop_units() %>%
as.data.frame()
df1[,col_name] <- rowSums(dist_matrix <= radius)
return(df1)
}3.4 Calculation of Locational Factors With Radius
resale_main_sf <- prox_cal_radius(resale_main_sf, kindergarten_sf, "WITHIN_350M_KINDERGARTEN", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, childcare_sf, "WITHIN_350M_CHILDCARE", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, BusStop_sf, "WITHIN_350M_BUS", 350)
resale_main_sf <- prox_cal_radius(resale_main_sf, primary_school_sf, "WITHIN_1KM_PRIMARY", 1000)3.5 Saving Dataset
resale_main_sf <- resale_main_sf %>%
mutate() %>%
rename("AREA_SQM" = "floor_area_sqm",
"LEASE_MTHS" = "remaining_lease_mths",
"PRICE" = "resale_price",
"STOREY" = "storey_range")resale_main.rds <- write_rds(resale_main_sf, "data/model/resale_main.rds")4 Exploratory Data Analysis (EDA)
4.1 Read resale_main_rds file
resale_main_sf <- read_rds("data/model/resale_main.rds")glimpse(resale_main_sf)Rows: 25,502
Columns: 28
$ month <chr> "2021-01", "2021-01", "2021-01", "2021-01", "…
$ town <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "AN…
$ address <chr> "547 ANG MO KIO AVE 10", "414 ANG MO KIO AVE …
$ block <chr> "547", "414", "509", "467", "571", "134", "20…
$ street_name <chr> "ANG MO KIO AVE 10", "ANG MO KIO AVE 10", "AN…
$ flat_type <chr> "4 ROOM", "4 ROOM", "4 ROOM", "4 ROOM", "4 RO…
$ STOREY <chr> "04 TO 06", "01 TO 03", "01 TO 03", "07 TO 09…
$ AREA_SQM <dbl> 92, 92, 91, 92, 92, 98, 92, 92, 92, 92, 92, 1…
$ flat_model <chr> "New Generation", "New Generation", "New Gene…
$ lease_commence_date <dbl> 1981, 1979, 1980, 1979, 1979, 1978, 1977, 197…
$ LEASE_MTHS <dbl> 708, 693, 702, 695, 689, 681, 661, 682, 692, …
$ PRICE <dbl> 370000, 375000, 380000, 385000, 410000, 41000…
$ postal <chr> "560547", "560414", "560509", "560467", "5605…
$ geometry <POINT [m]> POINT (30770.07 39578.64), POINT (30292…
$ PROX_CBD <dbl> 9.564575, 8.401690, 9.516492, 8.580908, 9.084…
$ PROX_BUS <dbl> 0.16157609, 0.16740841, 0.07424143, 0.0887911…
$ PROX_CHILDCARE <dbl> 2.493662e-01, 6.715056e-02, 1.385583e-01, 1.4…
$ PROX_ELDERCARE <dbl> 1.08567795, 0.15039052, 0.72242472, 0.0981628…
$ PROX_HAWKER <dbl> 0.4442515, 0.2050009, 0.4495734, 0.3190679, 0…
$ PROX_GOODPRIMARY <dbl> 3.182527, 2.354024, 2.414729, 2.699653, 2.648…
$ PROX_PARK <dbl> 0.8291527, 0.7847864, 0.3796058, 0.9242129, 0…
$ PROX_SUPERMARKET <dbl> 0.4184204, 0.1946009, 0.4435109, 0.4269715, 0…
$ PROX_SHOPPING <dbl> 1.1817959, 0.8444986, 0.2966736, 0.9304149, 0…
$ PROX_MRT <dbl> 1.0731215, 0.8245176, 0.4544926, 0.9503956, 0…
$ WITHIN_350M_KINDERGARTEN <dbl> 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, …
$ WITHIN_350M_CHILDCARE <dbl> 2, 3, 3, 3, 3, 2, 6, 3, 3, 3, 3, 3, 5, 2, 3, …
$ WITHIN_350M_BUS <dbl> 4, 7, 10, 4, 8, 2, 8, 7, 6, 7, 7, 7, 8, 8, 11…
$ WITHIN_1KM_PRIMARY <dbl> 1, 3, 2, 3, 2, 2, 3, 2, 3, 3, 1, 2, 3, 2, 2, …
4.2 Statistical Graphics
4.2.1 Histogram - Distribution of 4-Room Resale Prices
ggplot(data = resale_main_sf, aes(x = `PRICE`)) +
geom_histogram(bins = 20, color = "black", fill = "light green") +
labs(title = "Distribution of 4-Room Resale Prices",
x = "Resale Prices",
y = "Frequency")
Based on the above graph, the histogram is skewed towards the right, meaning that 4-room HDB flats were sold at relatively lower prices.
4.2.2 Boxplots - Distribution of 4-Room Resale Prices
ggplot(data = resale_main_sf, aes(x = '', y = PRICE)) +
geom_boxplot() +
labs(x = '', y = 'Resale Prices')
summary(resale_main_sf$PRICE) Min. 1st Qu. Median Mean 3rd Qu. Max.
250000 445000 495000 529142 570000 1370000
Min. 1st Qu. Median Mean 3rd Qu. Max.
250000 445000 495000 529142 570000 1370000
From the graph above, it is clear that there’s a number of outliers at the higher end, and there’s one at the lower end. Most 4-rooms are sold between $445,000 to $570,000, with the lowest sold at $250,000 and the highest at $1,370,000.
4.2.3 Multiple Histogram Plots Distribution of Locational Factors
AREA_SQM <- ggplot(data = resale_main_sf, aes(x = `AREA_SQM`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
LEASE_MTHS <- ggplot(data = resale_main_sf, aes(x = `LEASE_MTHS`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_CBD <- ggplot(data = resale_main_sf, aes(x = `PROX_CBD`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_BUS <- ggplot(data = resale_main_sf, aes(x = `PROX_BUS`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_CHILDCARE <- ggplot(data = resale_main_sf, aes(x = `PROX_CHILDCARE`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_ELDERCARE <- ggplot(data = resale_main_sf, aes(x = `PROX_ELDERCARE`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_HAWKER <- ggplot(data = resale_main_sf, aes(x = `PROX_HAWKER`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_GOODPRIMARY <- ggplot(data = resale_main_sf, aes(x = `PROX_GOODPRIMARY`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_PARK <- ggplot(data = resale_main_sf, aes(x = `PROX_PARK`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_SUPERMARKET <- ggplot(data = resale_main_sf, aes(x = `PROX_SUPERMARKET`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_SHOPPING <- ggplot(data = resale_main_sf, aes(x = `PROX_SHOPPING`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
PROX_MRT <- ggplot(data = resale_main_sf, aes(x = `PROX_MRT`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
ggarrange(AREA_SQM, LEASE_MTHS, PROX_CBD, PROX_BUS, PROX_CHILDCARE, PROX_ELDERCARE, PROX_HAWKER, PROX_GOODPRIMARY, PROX_PARK, PROX_SUPERMARKET, PROX_SHOPPING, PROX_MRT, ncol = 3, nrow = 4)
4.2.3 Multiple Histogram Plots Distribution of Locational Factors With Radius
WITHIN_350M_KINDERGARTEN <- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_KINDERGARTEN`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
WITHIN_350M_CHILDCARE <- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_CHILDCARE`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
WITHIN_350M_BUS <- ggplot(data = resale_main_sf, aes(x = `WITHIN_350M_BUS`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
WITHIN_1KM_PRIMARY <- ggplot(data = resale_main_sf, aes(x = `WITHIN_1KM_PRIMARY`)) +
geom_histogram(bins = 20, color = "black", fill = "light green")
ggarrange(WITHIN_350M_KINDERGARTEN, WITHIN_350M_CHILDCARE, WITHIN_350M_BUS, WITHIN_1KM_PRIMARY, ncol = 2, nrow = 2)
4.2.4 Point Map
tmap_mode("view")
tmap_options(check.and.fix = TRUE)
tm_shape(resale_main_sf)+
tm_dots(col = "PRICE",
alpha = 0.6,
style = "quantile",
popup.vars=c("block"="block", "street_name"="street_name", "flat_model" = "flat_model", "town" = "town", "PRICE" = "PRICE", "LEASE_MTHS", "LEASE_MTHS")) +
tm_view(set.zoom.limits = c(11, 16))